Library Requirements¶
In [1]:
!pip install pydeck
Requirement already satisfied: pydeck in /usr/local/lib/python3.11/dist-packages (0.9.1) Requirement already satisfied: jinja2>=2.10.1 in /usr/local/lib/python3.11/dist-packages (from pydeck) (3.1.5) Requirement already satisfied: numpy>=1.16.4 in /usr/local/lib/python3.11/dist-packages (from pydeck) (1.26.4) Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2>=2.10.1->pydeck) (3.0.2)
In [2]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
import pydeck as pdk
from shapely.geometry import Point, MultiPoint
import statsmodels.formula.api as smf
Pre-Processing¶
- Read the CSV file and explore the datasets.
- Remove NA values.
- Convert the column from object to boolean type of variable or better to read.
In [3]:
# Read the CSV files
df = pd.read_csv("real-estate-data.csv")
df.head()
Out[3]:
| id_ | ward | beds | baths | DEN | size | parking | exposure | D_mkt | building_age | maint | price | lt | lg | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 219129 | W13 | 3.0 | 3 | No | 1500-1999 sqft | N | No | 16.0 | 9 | 1087.0 | 1821000.0 | 43.617997 | -79.392383 |
| 1 | 757581 | W13 | 1.0 | 1 | YES | 500-999 sqft | Yes | We | 23.0 | 3 | 469.0 | 613000.0 | 43.648968 | -79.390031 |
| 2 | 404196 | W13 | 2.0 | 2 | YES | 500-999 sqft | Yes | We | 12.0 | 1 | 767.0 | 838000.0 | 43.641045 | -79.375436 |
| 3 | 821441 | W13 | 2.0 | 2 | No | 500-999 sqft | Yes | S | 10.0 | 25 | 827.0 | 935000.0 | 43.642122 | -79.370250 |
| 4 | 612090 | W10 | 2.0 | 1 | No | NaN | N | S | 5.0 | 1 | NaN | 1328000.0 | 43.692210 | -79.365015 |
In [4]:
# Read the Toronto neighbourhood GeoJSON files
# Load the GeoJSON file into a GeoDataFrame
neighborhoods = gpd.read_file("Neighbourhoods - 4326.geojson")
neighborhoods.head()
Out[4]:
| _id | AREA_ID | AREA_ATTR_ID | PARENT_AREA_ID | AREA_SHORT_CODE | AREA_LONG_CODE | AREA_NAME | AREA_DESC | CLASSIFICATION | CLASSIFICATION_CODE | OBJECTID | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2502366 | 26022881 | 0 | 174 | 174 | South Eglinton-Davisville | South Eglinton-Davisville (174) | Not an NIA or Emerging Neighbourhood | NA | 17824737.0 | MULTIPOLYGON (((-79.38635 43.69783, -79.38623 ... |
| 1 | 2 | 2502365 | 26022880 | 0 | 173 | 173 | North Toronto | North Toronto (173) | Not an NIA or Emerging Neighbourhood | NA | 17824753.0 | MULTIPOLYGON (((-79.39744 43.70693, -79.39837 ... |
| 2 | 3 | 2502364 | 26022879 | 0 | 172 | 172 | Dovercourt Village | Dovercourt Village (172) | Not an NIA or Emerging Neighbourhood | NA | 17824769.0 | MULTIPOLYGON (((-79.43411 43.66015, -79.43537 ... |
| 3 | 4 | 2502363 | 26022878 | 0 | 171 | 171 | Junction-Wallace Emerson | Junction-Wallace Emerson (171) | Not an NIA or Emerging Neighbourhood | NA | 17824785.0 | MULTIPOLYGON (((-79.4387 43.66766, -79.43841 4... |
| 4 | 5 | 2502362 | 26022877 | 0 | 170 | 170 | Yonge-Bay Corridor | Yonge-Bay Corridor (170) | Not an NIA or Emerging Neighbourhood | NA | 17824801.0 | MULTIPOLYGON (((-79.38404 43.64497, -79.38502 ... |
In [5]:
# Shape of the dataset and data types
print(df.shape)
print(df.dtypes)
(3042, 14) id_ int64 ward object beds float64 baths int64 DEN object size object parking object exposure object D_mkt float64 building_age int64 maint float64 price float64 lt float64 lg float64 dtype: object
In [6]:
# Count missing values for each column
missing_values = df.isna().sum()
print(missing_values)
id_ 0 ward 0 beds 54 baths 0 DEN 0 size 53 parking 0 exposure 0 D_mkt 93 building_age 0 maint 45 price 61 lt 0 lg 0 dtype: int64
In [7]:
# Calculate which neighbourhood a unit belongs to.
# Create a geometry column from the longitude and latitude values:
df['geometry'] = df.apply(lambda row: Point(row['lg'], row['lt']), axis=1)
# Convert the DataFrame to a GeoDataFrame with WGS84 CRS
gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")
# Ensure the neighbourhoods GeoDataFrame is in the same CRS as our points
if neighborhoods.crs != gdf.crs:
neighborhoods = neighborhoods.to_crs(gdf.crs)
# Perform a spatial join: assign each point the attributes of the neighbourhood it falls in.
# Here we assume the neighbourhood name is stored in a column called 'neighbourhood' in the GeoJSON.
# Adjust the column name accordingly if different.
gdf = gpd.sjoin(gdf, neighborhoods[['AREA_NAME', 'geometry']], how="left", predicate="within")
# Add the 'neighbourhood' column back to the original DataFrame
df['AREA_NAME'] = gdf['AREA_NAME']
df.drop(columns='geometry', inplace=True)
In [8]:
# Drop NA values
df = df.dropna()
df.shape
Out[8]:
(2749, 15)
In [9]:
# Check the unique values from each column to get the sense of the dataset
for col in df.columns:
print(f"Unique values in '{col}':")
print(df[col].unique())
print("-" * 40)
Unique values in 'id_': [219129 757581 404196 ... 476334 413482 484367] ---------------------------------------- Unique values in 'ward': ['W13' 'W10' 'W11'] ---------------------------------------- Unique values in 'beds': [3. 1. 2. 0.] ---------------------------------------- Unique values in 'baths': [3 1 2] ---------------------------------------- Unique values in 'DEN': ['No' 'YES'] ---------------------------------------- Unique values in 'size': ['1500-1999 sqft' '500-999 sqft' '2000-2499 sqft' '1000-1499 sqft' '0-499 sqft' '5500-3999 sqft' '2500-2999 sqft' '3000-3499 sqft' '4000+ sqft'] ---------------------------------------- Unique values in 'parking': ['N' 'Yes'] ---------------------------------------- Unique values in 'exposure': ['No' 'We' 'S' 'E'] ---------------------------------------- Unique values in 'D_mkt': [ 16. 23. 12. 10. 1. 4. 5. 13. 19. 37. 3. 54. 2. 6. 24. 27. 0. 28. 33. 30. 45. 26. 43. 17. 15. 8. 29. 35. 20. 7. 34. 21. 14. 11. 9. 22. 31. 40. 46. 32. 74. 36. 18. 25. 38. 55. 87. 47. 62. 44. 53. 78. 42. 49. 39. 67. 59. 83. 57. 48. 41. 51. 169. 64. 73. 58. 79. 52. 65. 85. 50. 80. 63. 56. 81. 86. 68. 89. 66. 61. 84. 70. 69.] ---------------------------------------- Unique values in 'building_age': [ 9 3 1 25 12 8 4 11 0 19 7 16 14 5 6 13 10 2 15 34 50 23 56 20 29 31 22 21 17 39 24 37 52 18 46 47 33 28 42 32 30 26 48 27 49 58 55 62 36 59 60 61 40 35 45 41 44 43 75 66 69 65 64 53 38] ---------------------------------------- Unique values in 'maint': [1087. 469. 767. ... 1109. 892. 1183.] ---------------------------------------- Unique values in 'price': [1821000. 613000. 838000. ... 1202000. 1409000. 1386000.] ---------------------------------------- Unique values in 'lt': [43.61799707 43.64896846 43.64104467 ... 43.63682962 43.66931697 43.64853276] ---------------------------------------- Unique values in 'lg': [-79.39238293 -79.39003091 -79.37543576 ... -79.41218711 -79.40804663 -79.35875259] ---------------------------------------- Unique values in 'AREA_NAME': ['St Lawrence-East Bayfront-The Islands' 'Wellington Place' 'Kensington-Chinatown' 'Cabbagetown-South St.James Town' 'Harbourfront-CityPlace' 'Rosedale-Moore Park' 'South Parkdale' 'Regent Park' 'Palmerston-Little Italy' 'Annex' 'North St.James Town' 'Fort York-Liberty Village' 'Leaside-Bennington' 'Moss Park' 'Bay-Cloverhill' 'Dovercourt Village' 'Trinity-Bellwoods' 'Downtown Yonge East' 'Yonge-Bay Corridor' 'Church-Wellesley' 'University' 'West Queen West'] ----------------------------------------
In [10]:
# Convert boolean-like categorical variables to binary format
df["DEN"] = df["DEN"].map({"YES": 1, "No": 0}).astype(int)
df["parking"] = df["parking"].map({"Yes": 1, "N": 0}).astype(int)
df["exposure"] = df["exposure"].map({"No": "North", "We": "West", "S": "South", "E": "East"})
<ipython-input-10-2a5a80bf2f33>:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df["DEN"] = df["DEN"].map({"YES": 1, "No": 0}).astype(int)
<ipython-input-10-2a5a80bf2f33>:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df["parking"] = df["parking"].map({"Yes": 1, "N": 0}).astype(int)
<ipython-input-10-2a5a80bf2f33>:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df["exposure"] = df["exposure"].map({"No": "North", "We": "West", "S": "South", "E": "East"})
In [11]:
# After doing the pre-processing, unique values of our data types
for col in df.columns:
print(f"Unique values in '{col}':")
print(df[col].unique())
print("-" * 40)
Unique values in 'id_': [219129 757581 404196 ... 476334 413482 484367] ---------------------------------------- Unique values in 'ward': ['W13' 'W10' 'W11'] ---------------------------------------- Unique values in 'beds': [3. 1. 2. 0.] ---------------------------------------- Unique values in 'baths': [3 1 2] ---------------------------------------- Unique values in 'DEN': [0 1] ---------------------------------------- Unique values in 'size': ['1500-1999 sqft' '500-999 sqft' '2000-2499 sqft' '1000-1499 sqft' '0-499 sqft' '5500-3999 sqft' '2500-2999 sqft' '3000-3499 sqft' '4000+ sqft'] ---------------------------------------- Unique values in 'parking': [0 1] ---------------------------------------- Unique values in 'exposure': ['North' 'West' 'South' 'East'] ---------------------------------------- Unique values in 'D_mkt': [ 16. 23. 12. 10. 1. 4. 5. 13. 19. 37. 3. 54. 2. 6. 24. 27. 0. 28. 33. 30. 45. 26. 43. 17. 15. 8. 29. 35. 20. 7. 34. 21. 14. 11. 9. 22. 31. 40. 46. 32. 74. 36. 18. 25. 38. 55. 87. 47. 62. 44. 53. 78. 42. 49. 39. 67. 59. 83. 57. 48. 41. 51. 169. 64. 73. 58. 79. 52. 65. 85. 50. 80. 63. 56. 81. 86. 68. 89. 66. 61. 84. 70. 69.] ---------------------------------------- Unique values in 'building_age': [ 9 3 1 25 12 8 4 11 0 19 7 16 14 5 6 13 10 2 15 34 50 23 56 20 29 31 22 21 17 39 24 37 52 18 46 47 33 28 42 32 30 26 48 27 49 58 55 62 36 59 60 61 40 35 45 41 44 43 75 66 69 65 64 53 38] ---------------------------------------- Unique values in 'maint': [1087. 469. 767. ... 1109. 892. 1183.] ---------------------------------------- Unique values in 'price': [1821000. 613000. 838000. ... 1202000. 1409000. 1386000.] ---------------------------------------- Unique values in 'lt': [43.61799707 43.64896846 43.64104467 ... 43.63682962 43.66931697 43.64853276] ---------------------------------------- Unique values in 'lg': [-79.39238293 -79.39003091 -79.37543576 ... -79.41218711 -79.40804663 -79.35875259] ---------------------------------------- Unique values in 'AREA_NAME': ['St Lawrence-East Bayfront-The Islands' 'Wellington Place' 'Kensington-Chinatown' 'Cabbagetown-South St.James Town' 'Harbourfront-CityPlace' 'Rosedale-Moore Park' 'South Parkdale' 'Regent Park' 'Palmerston-Little Italy' 'Annex' 'North St.James Town' 'Fort York-Liberty Village' 'Leaside-Bennington' 'Moss Park' 'Bay-Cloverhill' 'Dovercourt Village' 'Trinity-Bellwoods' 'Downtown Yonge East' 'Yonge-Bay Corridor' 'Church-Wellesley' 'University' 'West Queen West'] ----------------------------------------
Exploratory Data Analysis¶
Distribution of House Prices¶
In [12]:
# Set a modern style
sns.set_theme(style="whitegrid")
# Create an enhanced histogram with KDE (Kernel Density Estimation)
plt.figure(figsize=(12, 6))
ax = sns.histplot(df["price"], bins=30, kde=True, color="dodgerblue", edgecolor="black", alpha=0.7)
# Format x-axis for price display
ax.xaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'${x:,.0f}'))
# Customize appearance
plt.xlabel("Price ($)", fontsize=13, fontweight="bold")
plt.ylabel("Frequency", fontsize=13, fontweight="bold")
plt.title("Distribution of Property Prices", fontsize=15, fontweight="bold", pad=15)
# Remove top and right borders for a cleaner look
sns.despine()
# Show the plot
plt.show()
Distribution of Property Prices by Bedrooms and Den Presence¶
In [13]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
# Ensure 'DEN' column exists and fill NaN values if necessary
if "DEN" in df.columns:
df["DEN"] = df["DEN"].fillna(0) # Replace missing values with 0
df["DEN"] = df["DEN"].map({1: "With Den", 0: "No Den"}) # Convert to readable format
else:
raise KeyError("Column 'DEN' not found in DataFrame")
# Set an enhanced modern style
sns.set_theme(style="whitegrid")
# Create the violin plot
plt.figure(figsize=(12, 6))
ax = sns.violinplot(
x="beds", y="price", hue="DEN", data=df, palette="coolwarm", split=True, inner="quartile", linewidth=1.5, saturation=0.9
)
# Format y-axis for price display
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'${x:,.0f}'))
# Customize appearance
plt.xlabel("Number of Bedrooms", fontsize=13, fontweight="bold")
plt.ylabel("Price ($)", fontsize=13, fontweight="bold")
plt.title("Distribution of Property Prices by Bedrooms and Den Presence", fontsize=15, fontweight="bold", pad=15)
# Improve x-axis readability
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Add legend with better positioning
plt.legend(title="Den Presence", title_fontsize="12", loc="upper left", bbox_to_anchor=(1, 1))
# Remove top and right borders for a cleaner look
sns.despine()
# Show the plot
plt.show()
Distribution of Exposure vs Price¶
In [14]:
# Set a modern style
sns.set_theme(style="whitegrid")
# Create an enhanced boxplot
plt.figure(figsize=(10, 6))
ax = sns.boxplot(
x="exposure", y="price", data=df, palette="coolwarm", linewidth=1.5, notch=True, width=0.6
)
# Format y-axis for price display
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'${x:,.0f}'))
# Customize appearance
plt.xlabel("Exposure", fontsize=13, fontweight="bold")
plt.ylabel("Price ($)", fontsize=13, fontweight="bold")
plt.title("Property Prices by Exposure Direction", fontsize=15, fontweight="bold", pad=15)
# Improve x-axis readability
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Remove top and right borders for a cleaner look
sns.despine()
# Show the plot
plt.show()
<ipython-input-14-9da728f6007c>:6: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. ax = sns.boxplot(
Distribution of Day in the Market vs Price¶
In [15]:
# Set a modern style
sns.set_theme(style="whitegrid")
# Create an enhanced scatter plot
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(
x=df["D_mkt"], y=df["price"], alpha=0.6, color="purple", edgecolor="black"
)
# Add a trend line using regression (without confidence interval for clarity)
sns.regplot(
x=df["D_mkt"], y=df["price"], scatter=False, color="black", line_kws={"linestyle": "dashed"}
)
# Format y-axis for price display
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'${x:,.0f}'))
# Customize appearance
plt.xlabel("Days on Market", fontsize=13, fontweight="bold")
plt.ylabel("Price ($)", fontsize=13, fontweight="bold")
plt.title("Days on Market vs Property Price", fontsize=15, fontweight="bold", pad=15)
# Improve x-axis readability
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Remove top and right borders for a cleaner look
sns.despine()
# Show the plot
plt.show()
Distribution of Age of Building vs Property Price¶
In [16]:
# Set a modern style
sns.set_theme(style="whitegrid")
# Create an enhanced scatter plot for Building Age vs Price
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(
x=df["building_age"], y=df["price"], alpha=0.6, color="red", edgecolor="black"
)
# Add a trend line using regression (without confidence interval for clarity)
sns.regplot(
x=df["building_age"], y=df["price"], scatter=False, color="black", line_kws={"linestyle": "dashed"}
)
# Format y-axis for price display
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'${x:,.0f}'))
# Customize appearance
plt.xlabel("Building Age (Years)", fontsize=13, fontweight="bold")
plt.ylabel("Price ($)", fontsize=13, fontweight="bold")
plt.title("Building Age vs Property Price", fontsize=15, fontweight="bold", pad=15)
# Improve x-axis readability
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Remove top and right borders for a cleaner look
sns.despine()
# Show the plot
plt.show()
In [17]:
# Set a modern style
sns.set_theme(style="whitegrid")
# Create FacetGrid for Building Age vs Price divided by number of bedrooms
g = sns.FacetGrid(df, col="beds", col_wrap=3, height=4, sharex=True, sharey=True)
g.map_dataframe(sns.scatterplot, x="building_age", y="price", alpha=0.6, color="red", edgecolor="black")
# Add regression lines to each facet
g.map_dataframe(sns.regplot, x="building_age", y="price", scatter=False, color="black", line_kws={"linestyle": "dashed"})
# Format y-axis for price display
for ax in g.axes.flat:
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'${x:,.0f}'))
# Customize titles and labels
g.set_axis_labels("Building Age (Years)", "Price ($)")
g.set_titles(col_template="Beds: {col_name}")
# Improve readability
plt.subplots_adjust(top=0.85)
g.fig.suptitle("Building Age vs Property Price Across Different Bedroom Counts", fontsize=15, fontweight="bold")
# Show the plot
plt.show()
Distribution of Parking vs Price¶
In [18]:
# Set a modern style
sns.set_theme(style="whitegrid")
# Convert 'parking' column to a readable format
df["parking"] = df["parking"].map({1: "With Parking", 0: "No Parking"})
# Create an enhanced boxplot
plt.figure(figsize=(8, 6))
ax = sns.boxplot(
x="parking", y="price", data=df, palette="Set2", linewidth=1.5, notch=True, width=0.6
)
# Format y-axis for price display
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'${x:,.0f}'))
# Customize appearance
plt.xlabel("Parking Availability", fontsize=13, fontweight="bold")
plt.ylabel("Price ($)", fontsize=13, fontweight="bold")
plt.title("Impact of Parking Availability on Property Prices", fontsize=15, fontweight="bold", pad=15)
# Improve x-axis readability
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Remove top and right borders for a cleaner look
sns.despine()
# Show the plot
plt.show()
<ipython-input-18-e15dd4ec88be>:9: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. ax = sns.boxplot(
Distribution of Property Prices by Size¶
In [19]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
size_order = [
"0-499 sqft", "500-999 sqft", "1000-1499 sqft", "1500-1999 sqft",
"2000-2499 sqft", "2500-2999 sqft", "3000-3499 sqft", "4000+ sqft"
]
# Define color mapping for each size category
size_colors = {
"0-499 sqft": "red",
"500-999 sqft": "blue",
"1000-1499 sqft": "green",
"1500-1999 sqft": "purple",
"2000-2499 sqft": "orange",
"2500-2999 sqft": "pink",
"3000-3499 sqft": "cyan",
"4000+ sqft": "brown"
}
# Set a modern style
sns.set_theme(style="whitegrid")
# Create the violin plot with an enhanced color palette
plt.figure(figsize=(12, 6))
ax = sns.violinplot(
x="size", y="price", data=df, order=size_order, palette="coolwarm", inner="quartile", linewidth=1.2
)
# Format y-axis to display full price values (e.g., $500K, $1M, etc.)
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'${x:,.0f}'))
# Customize the appearance
plt.xlabel("Size Category", fontsize=12, fontweight="bold")
plt.ylabel("Price ($)", fontsize=12, fontweight="bold")
plt.title("Distribution of Property Prices by Size", fontsize=14, fontweight="bold", pad=15)
# Rotate x-axis labels for better readability
plt.xticks(rotation=45, fontsize=10)
plt.yticks(fontsize=10)
# Remove top and right borders for a cleaner look
sns.despine()
# Show the plot
plt.show()
<ipython-input-19-ac7d038b7078>:27: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. ax = sns.violinplot(
High Correlation between Beds and Baths¶
In [20]:
import numpy as np
correlation = df["beds"].corr(df["baths"])
print(f"Correlation between beds and baths: {correlation:.2f}")
Correlation between beds and baths: 0.75
Scatterplot of Maintence Cost vs Property Price¶
In [21]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
# Set a modern style
sns.set_theme(style="whitegrid")
# Create the scatter plot
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(x="maint", y="price", data=df, alpha=0.6, color="dodgerblue")
# Format y-axis to display full price values
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'${x:,.0f}'))
# Customize appearance
plt.xlabel("Maintenance Cost ($)", fontsize=12, fontweight="bold")
plt.ylabel("Price ($)", fontsize=12, fontweight="bold")
plt.title("Scatter Plot of Maintenance Cost vs Property Price", fontsize=14, fontweight="bold", pad=15)
# Improve grid visibility
plt.grid(True, linestyle="--", alpha=0.6)
# Show the plot
plt.show()
In [22]:
import folium
from folium.plugins import MarkerCluster
# Initialize a map centered around Toronto
m = folium.Map(location=[df["lt"].mean(), df["lg"].mean()], zoom_start=12)
# Define a price-based color scale
def price_color(price):
if price < 500000:
return "green"
elif 500000 <= price < 1000000:
return "blue"
elif 1000000 <= price < 1500000:
return "orange"
else:
return "red"
# Add markers for each property
marker_cluster = MarkerCluster().add_to(m)
for _, row in df.iterrows():
folium.Marker(
location=[row["lt"], row["lg"]],
popup=f"Price: ${row['price']:,.0f}\nBeds: {row['beds']}\nBaths: {row['baths']}",
icon=folium.Icon(color=price_color(row["price"]))
).add_to(marker_cluster)
# Display map
m.save("property_map.html") # Saves map as an HTML file (open in browser)
m
Out[22]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [23]:
# # 1. Load your CSV data (ensure "real-estate-data.csv" is in your working directory)
# df = pd.read_csv("real-estate-data.csv")
# df = df.dropna(subset=["lt", "lg", "price", "ward"])
# 2. Define an elevation scale for the bars
elevation_scale = 0.0002
# 3. Define a function to map price to a color for the 3D bars
def price_to_color(price):
min_price, max_price = 500000, 2500000
ratio = (price - min_price) / (max_price - min_price + 1e-9)
ratio = max(0, min(ratio, 1))
r = int(150 + ratio * (255 - 150))
g = int(ratio * 255)
b = int(150 - ratio * 150)
return [r, g, b]
df["bar_color"] = df["price"].apply(price_to_color)
# 4. Create a ColumnLayer for property data (bars colored by price)
column_layer = pdk.Layer(
"ColumnLayer",
data=df,
get_position=["lg", "lt"], # [longitude, latitude]
get_elevation="price", # Height based on price
elevation_scale=elevation_scale,
radius=25, # Skinnier columns
get_fill_color="bar_color", # Color by price
pickable=True,
extruded=True,
auto_highlight=True,
)
# 5. Compute maximum elevation so we can position ward labels above the bars
max_price_val = df["price"].max()
max_elevation = max_price_val * elevation_scale
label_elevation = max_elevation + 50 # Place label 50 units above the tallest bar
# 6. Group by ward and compute a convex hull (or fallback rectangle) for each ward
ward_regions = []
# Define a color mapping for the wards:
ward_color_map = {
"W10": [255, 99, 71], # Tomato (e.g., Spadina-Fort York)
"W11": [60, 179, 113], # Medium Sea Green (e.g., University-Rosedale)
"W13": [65, 105, 225] # Royal Blue (e.g., Toronto Centre)
}
unique_wards = df["ward"].unique()
for ward in unique_wards:
group = df[df["ward"] == ward]
points = list(zip(group["lg"], group["lt"]))
if len(points) < 3:
# Fallback: create a rectangle covering the points
min_lg = group["lg"].min()
max_lg = group["lg"].max()
min_lt = group["lt"].min()
max_lt = group["lt"].max()
polygon = [
[min_lg, min_lt],
[min_lg, max_lt],
[max_lg, max_lt],
[max_lg, min_lt],
[min_lg, min_lt] # Close the polygon
]
centroid_lg = (min_lg + max_lg) / 2
centroid_lt = (min_lt + max_lt) / 2
else:
convex_hull = MultiPoint(points).convex_hull
polygon = list(convex_hull.exterior.coords)
centroid_lg = convex_hull.centroid.x
centroid_lt = convex_hull.centroid.y
fill_color = ward_color_map.get(ward, [200, 200, 200])
ward_regions.append({
"ward": ward,
"polygon": polygon,
"fill_color": fill_color,
"center_lg": centroid_lg,
"center_lt": centroid_lt
})
df_wards = pd.DataFrame(ward_regions)
# 7. Create a PolygonLayer for the ward regions (colored only where data exists)
polygon_layer = pdk.Layer(
"PolygonLayer",
data=df_wards,
get_polygon="polygon",
get_fill_color="fill_color",
get_line_color=[255, 255, 255],
pickable=False, # Disable tooltips on polygons
stroked=True,
extruded=False, # Flat polygons
opacity=0.3, # Semi-transparent for a polished look
)
# 8. Create a TextLayer for ward labels, positioned at each ward's centroid at a fixed elevation
text_layer = pdk.Layer(
"TextLayer",
data=df_wards,
get_position=["center_lg", "center_lt", label_elevation],
get_text="ward",
get_color=[255, 255, 255],
get_size=32, # Large font for presentation
get_alignment_baseline="'bottom'",
pickable=False,
)
# 9. Set the initial view centered on your data points
center_lat = df["lt"].mean()
center_lg = df["lg"].mean()
view_state = pdk.ViewState(
latitude=center_lat,
longitude=center_lg,
zoom=12,
pitch=50, # Tilt for a good 3D perspective
)
# 10. Create the Deck with three layers:
# - PolygonLayer: colored ward regions (covering only the area with data)
# - ColumnLayer: 3D property bars (colored by price)
# - TextLayer: ward labels above the bars
deck = pdk.Deck(
layers=[polygon_layer, column_layer, text_layer],
initial_view_state=view_state,
map_style="https://basemaps.cartocdn.com/gl/dark-matter-gl-style/style.json",
tooltip={"text": "Price: {price}\nBeds: {beds}\nBaths: {baths}\nWard: {ward}"}
)
# 11. Export the map to an HTML file
deck.to_html("geospatial.html")
# 12. Inject a custom HTML legend into the exported HTML file
legend_html = """
<div id="legend" style="position: absolute; top: 20px; right: 20px; background: rgba(255,255,255,0.8); padding: 10px; border-radius: 5px; font-family: sans-serif; z-index: 9999;">
<div><span style="display:inline-block; width:12px; height:12px; background: rgb(255,99,71); margin-right:5px;"></span>W10 Spadina-Fort York</div>
<div><span style="display:inline-block; width:12px; height:12px; background: rgb(60,179,113); margin-right:5px;"></span>W11 University-Rosedale</div>
<div><span style="display:inline-block; width:12px; height:12px; background: rgb(65,105,225); margin-right:5px;"></span>W13 Toronto Centre</div>
</div>
"""
html_file = "geospatial.html"
with open(html_file, "r", encoding="utf-8") as f:
html_content = f.read()
# Insert the legend right after the <body> tag
modified_html = html_content.replace("<body>", "<body>" + legend_html, 1)
with open("geospatial_with_legend.html", "w", encoding="utf-8") as f:
f.write(modified_html)
print("Map with legend saved as 'geospatial_with_legend.html'")
Map with legend saved as 'geospatial_with_legend.html'
In [24]:
import folium
import branca.colormap as cm
# Compute the average condo price for each neighbourhood from the DataFrame (df)
avg_price = df.groupby("AREA_NAME")["price"].mean().reset_index()
# Merge the average prices into the neighbourhood GeoDataFrame
# (Assuming that the common column is named "neighbourhood")
neigh = neighborhoods.merge(avg_price, on="AREA_NAME", how="left")
# Determine the min and max average price for the color scale
min_price = neigh["price"].min()
max_price = neigh["price"].max()
# Create a linear colormap ranging from red (low price) to blue (high price)
colormap = cm.LinearColormap(colors=["red", "blue"], vmin=min_price, vmax=max_price)
colormap.caption = "Average Condo Price (C$)"
# Initialize a Folium map centered on Toronto (latitude 43.65, longitude -79.38)
m = folium.Map(location=[43.65, -79.38], zoom_start=12)
# Add the neighbourhood polygons as a GeoJson layer
folium.GeoJson(
neigh,
style_function=lambda feature: {
"fillColor": colormap(feature["properties"]["price"])
if feature["properties"]["price"] is not None else "gray",
"color": "black",
"weight": 1,
"fillOpacity": 0.7,
},
tooltip=folium.features.GeoJsonTooltip(
fields=["AREA_NAME", "price"],
aliases=["Neighbourhood:", "Avg Price:"],
localize=True,
),
).add_to(m)
# Add the colormap to the map as a legend
colormap.add_to(m)
# Display the map (in a Jupyter Notebook this will render the interactive map)
m
Out[24]:
Make this Notebook Trusted to load map: File -> Trust Notebook
Basic Model¶
Simple Linear Model¶
In [25]:
model = smf.ols('price ~ ward + beds + baths + DEN + size + parking + exposure + D_mkt + building_age + maint + lt + lg', data=df).fit()
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.920
Model: OLS Adj. R-squared: 0.920
Method: Least Squares F-statistic: 1431.
Date: Sun, 02 Mar 2025 Prob (F-statistic): 0.00
Time: 15:44:10 Log-Likelihood: -36798.
No. Observations: 2749 AIC: 7.364e+04
Df Residuals: 2726 BIC: 7.378e+04
Df Model: 22
Covariance Type: nonrobust
===========================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------------
Intercept 3.199e+07 3.03e+07 1.055 0.291 -2.74e+07 9.14e+07
ward[T.W11] -6528.5448 1.19e+04 -0.547 0.585 -2.99e+04 1.69e+04
ward[T.W13] -2.337e+04 1.3e+04 -1.795 0.073 -4.89e+04 2160.646
DEN[T.With Den] 1.836e+04 6494.343 2.828 0.005 5628.526 3.11e+04
size[T.1000-1499 sqft] 3.172e+05 1.29e+04 24.646 0.000 2.92e+05 3.42e+05
size[T.1500-1999 sqft] 6.028e+05 1.84e+04 32.776 0.000 5.67e+05 6.39e+05
size[T.2000-2499 sqft] 8.711e+05 2.56e+04 33.990 0.000 8.21e+05 9.21e+05
size[T.2500-2999 sqft] 9.987e+05 4.03e+04 24.756 0.000 9.2e+05 1.08e+06
size[T.3000-3499 sqft] 1.19e+06 4.82e+04 24.667 0.000 1.1e+06 1.28e+06
size[T.4000+ sqft] 1.759e+06 6.4e+04 27.494 0.000 1.63e+06 1.88e+06
size[T.500-999 sqft] 9.659e+04 8204.931 11.772 0.000 8.05e+04 1.13e+05
size[T.5500-3999 sqft] 1.632e+06 4.87e+04 33.539 0.000 1.54e+06 1.73e+06
parking[T.With Parking] 4577.7335 6136.838 0.746 0.456 -7455.590 1.66e+04
exposure[T.North] -2909.4908 9509.430 -0.306 0.760 -2.16e+04 1.57e+04
exposure[T.South] -9786.8494 9249.280 -1.058 0.290 -2.79e+04 8349.459
exposure[T.West] -2.368e+04 1.09e+04 -2.174 0.030 -4.5e+04 -2325.346
beds 7.309e+04 7040.570 10.382 0.000 5.93e+04 8.69e+04
baths -4895.5616 7338.499 -0.667 0.505 -1.93e+04 9494.021
D_mkt 72.4343 220.141 0.329 0.742 -359.226 504.095
building_age -83.8346 308.142 -0.272 0.786 -688.050 520.380
maint 550.1355 11.387 48.314 0.000 527.808 572.463
lt -3.53e+05 3.99e+05 -0.886 0.376 -1.13e+06 4.28e+05
lg 2.062e+05 2.15e+05 0.957 0.339 -2.16e+05 6.28e+05
==============================================================================
Omnibus: 382.375 Durbin-Watson: 2.016
Prob(Omnibus): 0.000 Jarque-Bera (JB): 3646.035
Skew: 0.323 Prob(JB): 0.00
Kurtosis: 8.605 Cond. No. 9.11e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 9.11e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
In [25]: